import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
sub = pd.read_csv("./data/gender_submission.csv")
plt.figure(figsize=(10,7))
sns.heatmap(train.isnull(), yticklabels=False, cbar=False) # cbar : colorbar를 그리지 않음.
<Axes: >
plt.figure(figsize=(10,7))
sns.heatmap(test.isnull(), yticklabels=False, cbar=False) # cbar : colorbar를 그리지 않음.
<Axes: >
sns.set_style('whitegrid')
sns.countplot(x='Survived', data=train)
<Axes: xlabel='Survived', ylabel='count'>
# 생존자와 사망자 수 계산
survival_counts = train['Survived'].value_counts()
# Plotly 파이 차트 시각화
labels = ['Died', 'Survived']
values = [survival_counts[0], survival_counts[1]]
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=0.3,
marker=dict(colors=['salmon', 'skyblue']))])
fig.update_layout(title_text='Survival Rates on Titanic', title_x=0.5)
fig.show()
# 성별에 따른 생존율
plt.figure(figsize=(8, 6))
sns.countplot(data=train, x='Sex', hue='Survived', palette=['salmon', 'skyblue'])
plt.title('Survival Count by Gender')
plt.xlabel('gender')
plt.ylabel('Count')
plt.legend(['Died', 'Survived'])
plt.show()
# 나이 분포와 생존율
plt.figure(figsize=(10, 8))
sns.histplot(data=train, x='Age', hue='Survived', multiple='stack', palette=['salmon', 'skyblue'])
plt.title('Age Distribution by Survival')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()
# 객실 등급별 생존율
plt.figure(figsize=(8, 6))
sns.countplot(data=train, x='Pclass', hue='Survived', palette=['salmon', 'skyblue'])
plt.title('Survival Count by Pclass')
plt.xlabel('Pclass')
plt.ylabel('Count')
plt.legend(['Died', 'Survived'])
plt.show()
# 승선 항구별 생존율
plt.figure(figsize=(8, 6))
sns.countplot(data=train, x='Embarked', hue='Survived', palette=['salmon', 'skyblue'])
plt.title('Survival Count by Embarked')
plt.xlabel('Embarked')
plt.ylabel('Count')
plt.legend(['Died', 'Survived'])
plt.show()
# 가족 구성 여부에 따른 생존율
train['FamilySize'] = train['SibSp'] + train['Parch']
plt.figure(figsize=(10, 8))
sns.countplot(data=train, x='FamilySize', hue='Survived', palette=['salmon', 'skyblue'])
plt.title('Survival Count by Family Size')
plt.xlabel('Family Size')
plt.ylabel('Count')
plt.legend(['Died', 'Survived'])
plt.show()
f,ax=plt.subplots(1,2,figsize=(18,8))
# 첫번째 그래프
sns.histplot(train['Age'].dropna(), bins=30,ax=ax[0])
ax[0].set_title('train - Age')
# 두번째 그래프
sns.histplot(test['Age'].dropna(), bins=30,ax=ax[1])
ax[1].set_title('test - Age')
plt.show()
f,ax=plt.subplots(1,2,figsize=(18,8))
# 첫번째 그래프
plt.figure(figsize=(10, 8))
sns.histplot(data=train, x='Age', hue='Survived', multiple='stack', palette=['salmon', 'skyblue'],ax=ax[0])
ax[0].set_title('Age Distribution by Survival')
ax[0].set_xlabel('Age')
ax[0].set_ylabel('Count')
# 두번째 그래프
sns.histplot(test['Age'].dropna(), bins=30,ax=ax[1])
ax[1].set_title('test - Age')
plt.show()
<Figure size 1000x800 with 0 Axes>
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())
print(train.isnull().sum())
print(test.isnull().sum())
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 2 FamilySize 0 dtype: int64 PassengerId 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 1 Cabin 327 Embarked 0 dtype: int64
val_Embarked = train['Embarked'].value_counts()
val_Embarked
Embarked S 644 C 168 Q 77 Name: count, dtype: int64
train['Embarked'] = train['Embarked'].fillna('S')
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())
print(train.isnull().sum())
print(test.isnull().sum())
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 0 FamilySize 0 dtype: int64 PassengerId 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 327 Embarked 0 dtype: int64
print( train['Sex'].value_counts() )
print( train['Embarked'].value_counts() )
Sex male 577 female 314 Name: count, dtype: int64 Embarked S 646 C 168 Q 77 Name: count, dtype: int64
train['Sex'] = train['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
test['Sex'] = test['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
train['Embarked'] = train['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
test['Embarked']= test['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
## 나이에 대한 int 처리
train['Age'] = train['Age'].astype('int')
test['Age'] = test['Age'].astype('int')
print(train.columns)
print(train.info())
print()
print(test.info())
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'FamilySize'],
dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null int32
5 Age 891 non-null int32
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 891 non-null int32
12 FamilySize 891 non-null int64
dtypes: float64(1), int32(3), int64(6), object(3)
memory usage: 80.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 418 non-null int64
1 Pclass 418 non-null int64
2 Name 418 non-null object
3 Sex 418 non-null int32
4 Age 418 non-null int32
5 SibSp 418 non-null int64
6 Parch 418 non-null int64
7 Ticket 418 non-null object
8 Fare 418 non-null float64
9 Cabin 91 non-null object
10 Embarked 418 non-null int32
dtypes: float64(1), int32(3), int64(4), object(3)
memory usage: 31.2+ KB
None
# 사용할 특징과 예측할 타겟. 상관계수 확인
sel_XY = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'SibSp','Parch', 'Embarked', 'Survived']
train_all = train[sel_XY]
print( train_all.columns )
corr_XY = train_all.corr()
corr_XY
Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'SibSp', 'Parch',
'Embarked', 'Survived'],
dtype='object')
| PassengerId | Pclass | Sex | Age | SibSp | SibSp | Parch | Embarked | Survived | |
|---|---|---|---|---|---|---|---|---|---|
| PassengerId | 1.000000 | -0.035144 | 0.042939 | 0.033741 | -0.057527 | -0.057527 | -0.001652 | -0.030467 | -0.005007 |
| Pclass | -0.035144 | 1.000000 | 0.131900 | -0.335071 | 0.083081 | 0.083081 | 0.018443 | 0.045702 | -0.338481 |
| Sex | 0.042939 | 0.131900 | 1.000000 | 0.082533 | -0.114631 | -0.114631 | -0.245489 | -0.116569 | -0.543351 |
| Age | 0.033741 | -0.335071 | 0.082533 | 1.000000 | -0.232743 | -0.232743 | -0.176744 | 0.000234 | -0.067809 |
| SibSp | -0.057527 | 0.083081 | -0.114631 | -0.232743 | 1.000000 | 1.000000 | 0.414838 | -0.059961 | -0.035322 |
| SibSp | -0.057527 | 0.083081 | -0.114631 | -0.232743 | 1.000000 | 1.000000 | 0.414838 | -0.059961 | -0.035322 |
| Parch | -0.001652 | 0.018443 | -0.245489 | -0.176744 | 0.414838 | 0.414838 | 1.000000 | -0.078665 | 0.081629 |
| Embarked | -0.030467 | 0.045702 | -0.116569 | 0.000234 | -0.059961 | -0.059961 | -0.078665 | 1.000000 | 0.106811 |
| Survived | -0.005007 | -0.338481 | -0.543351 | -0.067809 | -0.035322 | -0.035322 | 0.081629 | 0.106811 | 1.000000 |
colormap = plt.cm.RdBu
plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(corr_XY, linewidths=0.1, vmax=1.0,
square=True, cmap=colormap, linecolor='white', annot=True, annot_kws={"size": 16})
<Axes: title={'center': 'Pearson Correlation of Features'}>
from sklearn.model_selection import train_test_split
# 'Name', 'Ticket' => 문자포함
sel = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'SibSp','Parch', 'Embarked' ]
# 학습에 사용될 데이터 준비 X_train, y_train
X_train_all = train[sel]
y_train_all = train['Survived']
# 데이터를 학습 세트와 테스트 세트로 분할 train(70%), test(30%)
X_train, X_test, y_train, y_test = train_test_split(X_train_all, y_train_all, test_size=0.3, random_state=42)
# 분할된 데이터 출력
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
X_train shape: (623, 8) X_test shape: (268, 8) y_train shape: (623,) y_test shape: (268,)
# 최종 예측을 위한 test 데이터
X_test_last = test[sel]
from sklearn.neighbors import KNeighborsClassifier
# 모델 구축 및 학습
model = KNeighborsClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
pred
array([0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
0, 1, 0, 1], dtype=int64)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# 정확도 계산
accuracy = accuracy_score(y_test, pred)
print(f'Accuracy: {accuracy:.2f}')
Accuracy: 0.54
# 혼동 행렬 계산 및 시각화
conf_matrix = confusion_matrix(y_test, pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True,
fmt='d', cmap='Blues', xticklabels=['Died', 'Survived'], yticklabels=['Died', 'Survived'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# 모델 리스트
models = {
"KNN": KNeighborsClassifier(n_neighbors=5),
"Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
"Logistic Regression": LogisticRegression()
}
# 결과 저장을 위한 리스트
results = []
# 각 모델에 대해 학습, 예측 및 평가 수행
for name, model in models.items():
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
results.append([name, accuracy, precision, recall, f1])
# 결과 데이터프레임 생성
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print(results_df)
Model Accuracy Precision Recall F1 Score 0 KNN 0.541045 0.416667 0.270270 0.327869 1 Random Forest 0.817164 0.844444 0.684685 0.756219 2 Logistic Regression 0.794776 0.797872 0.675676 0.731707
C:\ProgramData\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
from sklearn.ensemble import RandomForestClassifier
# 모델 구축
model = LogisticRegression()
# 학습
model.fit(X_train_all, y_train_all)
# 예측
y_pred = model.predict(X_test_last)
sub['Survived'] = y_pred
sub.head()
C:\ProgramData\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
| PassengerId | Survived | |
|---|---|---|
| 0 | 892 | 0 |
| 1 | 893 | 1 |
| 2 | 894 | 0 |
| 3 | 895 | 0 |
| 4 | 896 | 1 |
sub.to_csv("second_rf_model.csv", index=False)